CIE: Add an SSE2 version of "RGBA float" to "CIE Lab alpha float"

author Debarshi Ray <debarshir@gnome.org>

Sat, 28 Apr 2018 23:15:46 +0000 (01:15 +0200)

committer Debarshi Ray <debarshir@gnome.org>

Mon, 14 May 2018 12:52:32 +0000 (14:52 +0200)
author Debarshi Ray <debarshir@gnome.org>
Sat, 28 Apr 2018 23:15:46 +0000 (01:15 +0200)
committer Debarshi Ray <debarshir@gnome.org>
Mon, 14 May 2018 12:52:32 +0000 (14:52 +0200)
diff --git a/extensions/CIE.c b/extensions/CIE.c

index cb70d0f023398a6c7a43590e2b27c40597b975f3..c8a79b90523601d876f608821b290cf3a8d57bfb 100644 (file)
--- a/extensions/CIE.c
+++ b/extensions/CIE.c
@@ -2,7 +2,7 @@
   * Copyright (C) 2005, 2014 Øyvind Kolås.
   * Copyright (C) 2009, Martin Nordholts
   * Copyright (C) 2014, Elle Stone
- * Copyright (C) 2017, Red Hat, Inc.
+ * Copyright (C) 2017, 2018 Red Hat, Inc.
   *
   * This library is free software; you can redistribute it and/or
   * modify it under the terms of the GNU Lesser General Public
@@ -21,8 +21,13 @@
  
  #include "config.h"
  #include <math.h>
+#include <stdint.h>
  #include <string.h>
  
+#if defined(USE_SSE2)
+#include <emmintrin.h>
+#endif /* defined(USE_SSE2) */
+
  #include "babl-internal.h"
  #include "extensions/util.h"
  
@@ -557,8 +562,6 @@ lchaba_to_rgba (const Babl *conversion,char *src,
   * Return cube root of x
   */
  
-#include <stdint.h>
-
  static inline float
  _cbrtf (float x)
  {
@@ -1034,6 +1037,204 @@ Lchabaf_to_Labaf (const Babl *conversion,float *src,
      }
  }
  
+#if defined(USE_SSE2)
+
+/* This is an SSE2 version of Halley's method for approximating the
+ * cube root of an IEEE float implementation.
+ *
+ * The scalar version is as follows:
+ *
+ * static inline float
+ * _cbrt_5f (float x)
+ * {
+ *   union { float f; uint32_t i; } u = { x };
+ *
+ *   u.i = u.i / 3 + 709921077;
+ *   return u.f;
+ * }
+ *
+ * static inline float
+ * _cbrta_halleyf (float a, float R)
+ * {
+ *   float a3 = a * a * a;
+ *   float b = a * (a3 + R + R) / (a3 + a3 + R);
+ *   return b;
+ * }
+ *
+ * static inline float
+ * _cbrtf (float x)
+ * {
+ *   float a;
+ *
+ *   a = _cbrt_5f (x);
+ *   a = _cbrta_halleyf (a, x);
+ *   a = _cbrta_halleyf (a, x);
+ *   return a;
+ * }
+ *
+ * The above scalar version seems to have originated from
+ * http://metamerist.com/cbrt/cbrt.htm but that's not accessible
+ * anymore. At present there's a copy in CubeRoot.cpp in the Skia
+ * sources that's licensed under a BSD-style license. There's some
+ * discussion on the implementation at
+ * http://www.voidcn.com/article/p-gpwztojr-wt.html.
+ *
+ * Note that Darktable also has an SSE2 version of the same algorithm,
+ * but uses only a single iteration of Halley's method, which is too
+ * coarse.
+ */
+/* Return cube roots of the four single-precision floating point
+ * components of x.
+ */
+static inline __m128
+_cbrtf_ps_sse2 (__m128 x)
+{
+  const __m128i magic = _mm_set1_epi32 (709921077);
+
+  __m128i xi = _mm_castps_si128 (x);
+  __m128 xi_3 = _mm_div_ps (_mm_cvtepi32_ps (xi), _mm_set1_ps (3.0f));
+  __m128i ai = _mm_add_epi32 (_mm_cvtps_epi32 (xi_3), magic);
+  __m128 a = _mm_castsi128_ps (ai);
+
+  __m128 a3 = _mm_mul_ps (_mm_mul_ps (a, a), a);
+  __m128 divisor = _mm_add_ps (_mm_add_ps (a3, a3), x);
+  a = _mm_div_ps (_mm_mul_ps (a, _mm_add_ps (a3, _mm_add_ps (x, x))), divisor);
+
+  a3 = _mm_mul_ps (_mm_mul_ps (a, a), a);
+  divisor = _mm_add_ps (_mm_add_ps (a3, a3), x);
+  a = _mm_div_ps (_mm_mul_ps (a, _mm_add_ps (a3, _mm_add_ps (x, x))), divisor);
+
+  return a;
+}
+
+static inline __m128
+lab_r_to_f_sse2 (__m128 r)
+{
+  const __m128 epsilon = _mm_set1_ps (LAB_EPSILON);
+  const __m128 kappa = _mm_set1_ps (LAB_KAPPA);
+
+  const __m128 f_big = _cbrtf_ps_sse2 (r);
+
+  const __m128 f_small = _mm_div_ps (_mm_add_ps (_mm_mul_ps (kappa, r), _mm_set1_ps (16.0f)),
+                                     _mm_set1_ps (116.0f));
+
+  const __m128 mask = _mm_cmpgt_ps (r, epsilon);
+  const __m128 f = _mm_or_ps (_mm_and_ps (mask, f_big), _mm_andnot_ps (mask, f_small));
+  return f;
+}
+
+static void
+rgbaf_to_Labaf_sse2 (const Babl *conversion, const float *src, float *dst, long samples)
+{
+  const Babl *space = babl_conversion_get_source_space (conversion);
+  const float m_0_0 = space->space.RGBtoXYZf[0] / D50_WHITE_REF_X;
+  const float m_0_1 = space->space.RGBtoXYZf[1] / D50_WHITE_REF_X;
+  const float m_0_2 = space->space.RGBtoXYZf[2] / D50_WHITE_REF_X;
+  const float m_1_0 = space->space.RGBtoXYZf[3] / D50_WHITE_REF_Y;
+  const float m_1_1 = space->space.RGBtoXYZf[4] / D50_WHITE_REF_Y;
+  const float m_1_2 = space->space.RGBtoXYZf[5] / D50_WHITE_REF_Y;
+  const float m_2_0 = space->space.RGBtoXYZf[6] / D50_WHITE_REF_Z;
+  const float m_2_1 = space->space.RGBtoXYZf[7] / D50_WHITE_REF_Z;
+  const float m_2_2 = space->space.RGBtoXYZf[8] / D50_WHITE_REF_Z;
+  long i = 0;
+  long remainder;
+
+  if (((uintptr_t) src % 16) + ((uintptr_t) dst % 16) == 0)
+    {
+      const long    n = (samples / 4) * 4;
+      const __m128 m_0_0_v = _mm_set1_ps (m_0_0);
+      const __m128 m_0_1_v = _mm_set1_ps (m_0_1);
+      const __m128 m_0_2_v = _mm_set1_ps (m_0_2);
+      const __m128 m_1_0_v = _mm_set1_ps (m_1_0);
+      const __m128 m_1_1_v = _mm_set1_ps (m_1_1);
+      const __m128 m_1_2_v = _mm_set1_ps (m_1_2);
+      const __m128 m_2_0_v = _mm_set1_ps (m_2_0);
+      const __m128 m_2_1_v = _mm_set1_ps (m_2_1);
+      const __m128 m_2_2_v = _mm_set1_ps (m_2_2);
+
+      for ( ; i < n; i += 4)
+        {
+          __m128 Laba0;
+          __m128 Laba1;
+          __m128 Laba2;
+          __m128 Laba3;
+
+          __m128 rgba0 = _mm_load_ps (src);
+          __m128 rgba1 = _mm_load_ps (src + 4);
+          __m128 rgba2 = _mm_load_ps (src + 8);
+          __m128 rgba3 = _mm_load_ps (src + 12);
+
+          __m128 r = rgba0;
+          __m128 g = rgba1;
+          __m128 b = rgba2;
+          __m128 a = rgba3;
+          _MM_TRANSPOSE4_PS (r, g, b, a);
+
+          {
+            __m128 xr = _mm_add_ps (_mm_add_ps (_mm_mul_ps (m_0_0_v, r), _mm_mul_ps (m_0_1_v, g)),
+                                    _mm_mul_ps (m_0_2_v, b));
+            __m128 yr = _mm_add_ps (_mm_add_ps (_mm_mul_ps (m_1_0_v, r), _mm_mul_ps (m_1_1_v, g)),
+                                    _mm_mul_ps (m_1_2_v, b));
+            __m128 zr = _mm_add_ps (_mm_add_ps (_mm_mul_ps (m_2_0_v, r), _mm_mul_ps (m_2_1_v, g)),
+                                    _mm_mul_ps (m_2_2_v, b));
+
+            __m128 fx = lab_r_to_f_sse2 (xr);
+            __m128 fy = lab_r_to_f_sse2 (yr);
+            __m128 fz = lab_r_to_f_sse2 (zr);
+
+            __m128 L = _mm_sub_ps (_mm_mul_ps (_mm_set1_ps (116.0f), fy), _mm_set1_ps (16.0f));
+            __m128 A = _mm_mul_ps (_mm_set1_ps (500.0f), _mm_sub_ps (fx, fy));
+            __m128 B = _mm_mul_ps (_mm_set1_ps (200.0f), _mm_sub_ps (fy, fz));
+
+            Laba0 = L;
+            Laba1 = A;
+            Laba2 = B;
+            Laba3 = a;
+            _MM_TRANSPOSE4_PS (Laba0, Laba1, Laba2, Laba3);
+          }
+
+          _mm_store_ps (dst, Laba0);
+          _mm_store_ps (dst + 4, Laba1);
+          _mm_store_ps (dst + 8, Laba2);
+          _mm_store_ps (dst + 12, Laba3);
+
+          src += 16;
+          dst += 16;
+        }
+    }
+
+  remainder = samples - i;
+  while (remainder--)
+    {
+      float r = src[0];
+      float g = src[1];
+      float b = src[2];
+      float a = src[3];
+
+      float xr = m_0_0 * r + m_0_1 * g + m_0_2 * b;
+      float yr = m_1_0 * r + m_1_1 * g + m_1_2 * b;
+      float zr = m_2_0 * r + m_2_1 * g + m_2_2 * b;
+
+      float fx = xr > LAB_EPSILON ? _cbrtf (xr) : (LAB_KAPPA * xr + 16.0f) / 116.0f;
+      float fy = yr > LAB_EPSILON ? _cbrtf (yr) : (LAB_KAPPA * yr + 16.0f) / 116.0f;
+      float fz = zr > LAB_EPSILON ? _cbrtf (zr) : (LAB_KAPPA * zr + 16.0f) / 116.0f;
+
+      float L = 116.0f * fy - 16.0f;
+      float A = 500.0f * (fx - fy);
+      float B = 200.0f * (fy - fz);
+
+      dst[0] = L;
+      dst[1] = A;
+      dst[2] = B;
+      dst[3] = a;
+
+      src += 4;
+      dst += 4;
+    }
+}
+
+#endif /* defined(USE_SSE2) */
+
  static void
  conversions (void)
  {
@@ -1200,6 +1401,20 @@ conversions (void)
      NULL
    );
  
+#if defined(USE_SSE2)
+
+  if (babl_cpu_accel_get_support () & BABL_CPU_ACCEL_X86_SSE2)
+    {
+      babl_conversion_new (
+        babl_format ("RGBA float"),
+        babl_format ("CIE Lab alpha float"),
+        "linear", rgbaf_to_Labaf_sse2,
+        NULL
+      );
+    }
+
+#endif /* defined(USE_SSE2) */
+
    rgbcie_init ();
  }
  
diff --git a/extensions/Makefile.am b/extensions/Makefile.am

index a2a2943677f65f411f7604b97efb5b159a50a283..a066e8d1974c62a62e4a3262351e0863a5febce7 100644 (file)
--- a/extensions/Makefile.am
+++ b/extensions/Makefile.am
@@ -71,6 +71,7 @@ fast_float_la_SOURCES = fast-float.c
  LIBS =  $(top_builddir)/babl/libbabl-@BABL_API_VERSION@.la \
         $(MATH_LIB) $(THREAD_LIB)
  
+CIE_la_CFLAGS = $(SSE2_EXTRA_CFLAGS)
  sse2_float_la_CFLAGS = $(SSE2_EXTRA_CFLAGS)
  sse2_int8_la_CFLAGS = $(SSE2_EXTRA_CFLAGS)
  sse2_int16_la_CFLAGS = $(SSE2_EXTRA_CFLAGS)
author	Debarshi Ray <debarshir@gnome.org>
	Sat, 28 Apr 2018 23:15:46 +0000 (01:15 +0200)
committer	Debarshi Ray <debarshir@gnome.org>
	Mon, 14 May 2018 12:52:32 +0000 (14:52 +0200)
extensions/CIE.c		patch \| blob \| history
extensions/Makefile.am		patch \| blob \| history